library(dplyr)
library(readr)
library(broom)
library(ggplot2)
library(tidymodels)
library(probably)
library(vip)
library(plotly)
library(ClusterR)
library(cluster)
library(vip)
# Resolve namespace conflicts in favor of tidymodels functions
tidymodels_prefer()
theme_set(theme_bw())
# Use English names in date/time formatting (Windows locale string)
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
# Seed for reproducibility of the CV fold assignment below
set.seed(74)
# Read the breast cancer data and keep only the "mean" features for regression
breastCa <- read_csv(file = "breast-cancer.csv")
breastCa_Re <- breastCa %>%
  drop_na() %>%
  select(radius_mean:fractal_dimension_mean)
# Give `concave points_mean` a syntactic name and drop the original by name.
# (The original used select(-8), which silently breaks if column order changes;
# dropping by name is robust and provably removes the intended column.)
breastCa_Re_new <- breastCa_Re %>%
  mutate(concave_points_mean = `concave points_mean`) %>%
  select(-`concave points_mean`)
# 10-fold CV resamples built on the un-renamed data (used by the lm/LASSO fits)
breastCa_Re_CV <- vfold_cv(breastCa_Re, v = 10)
# Ordinary least squares regression specification
lm_spec <-
  linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'lm')

# LASSO specification: mixture = 1 selects the pure L1 penalty;
# the penalty (lambda) itself is tuned over a grid later
lm_lasso_spec <-
  linear_reg() %>%
  set_mode('regression') %>%
  set_engine(engine = 'glmnet') %>%
  set_args(mixture = 1, penalty = tune())
# Preprocessing recipe shared by both linear models:
#   1. drop predictors highly correlated with others
#   2. drop near-zero-variance predictors
#   3. center/scale numeric predictors (needed for a fair LASSO penalty)
#   4. dummy-encode any nominal predictors
least_rec <- recipe(area_mean ~ ., data = breastCa_Re) %>%
  step_corr(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

# Workflow: OLS model + shared recipe
least_lm_wf <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(least_rec)

# Workflow: LASSO model + the same recipe
lasso_wf <- workflow() %>%
  add_model(lm_lasso_spec) %>%
  add_recipe(least_rec)
#least square
# Fit the OLS workflow on the full data and inspect the coefficients
least_fit <- fit(least_lm_wf, data = breastCa_Re)
least_fit %>% tidy()
#LASSO
#tune
# Penalty grid: 30 lambda values, log10-spaced from 1e-3 to 1e1
penalty_grid <- grid_regular(
penalty(range = c(-3, 1)), #log10 transformed
levels = 30)
# Cross-validated grid search over the penalty values
tune_output <- tune_grid( # new function for tuning hyperparameters
lasso_wf, # workflow
resamples = breastCa_Re_CV, # cv folds
metrics = metric_set(rmse, mae),
grid = penalty_grid # penalty grid defined above
)
#fit
# One-standard-error rule: choose the largest penalty whose CV MAE is within
# 1 SE of the best, i.e. the most parsimonious near-optimal model
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'mae', desc(penalty))
final_wf_se <- finalize_workflow(lasso_wf, best_se_penalty)
lasso_fit <- fit(final_wf_se , data = breastCa_Re)
lasso_fit %>% tidy()
# Least Square model
# CV performance of the OLS workflow (RMSE and MAE averaged over the 10 folds)
least_fit_cv <- fit_resamples(least_lm_wf,
resamples = breastCa_Re_CV,
metrics = metric_set(rmse, mae)
)
least_fit_cv %>% collect_metrics(summarize = TRUE)
# LASSO model
# CV metrics at the selected penalty.
# NOTE(review): this filters with `==` on a double; it works here because
# both values originate from the same grid object, but dplyr::near() or a
# semi_join would be more robust to floating-point drift.
tune_output %>%
collect_metrics() %>%
filter(penalty == (best_se_penalty
%>% pull(penalty)))
# ---- Residual diagnostics --------------------------------------------------
# The original repeated the same ggplot block ten times; the logic is factored
# into two helpers. Plots are print()ed explicitly so behaviour matches the
# original top-level auto-printing.

# Augment a fitted workflow's training-set predictions with residuals
resid_output <- function(fit_obj, data) {
  fit_obj %>%
    predict(new_data = data) %>%
    bind_cols(data) %>%
    mutate(resid = area_mean - .pred)
}

# Residuals vs a chosen x-variable (fitted values or a predictor), with a
# smoother and a red reference line at zero
plot_resid <- function(output, xvar, xlab, ylab = "Residual") {
  p <- ggplot(output, aes(x = .data[[xvar]], y = resid)) +
    geom_point() +
    geom_smooth() +
    geom_hline(yintercept = 0, color = "red") +
    labs(x = xlab, y = ylab) +
    theme_classic()
  print(p)
}

#least square
least_fit_output <- resid_output(least_fit, breastCa_Re)
# Residuals vs fitted values
plot_resid(least_fit_output, ".pred", "Fitted values", ylab = "Residuals")
# Residuals vs. predictors (x's)
plot_resid(least_fit_output, "concavity_mean", "Concavity mean")
plot_resid(least_fit_output, "compactness_mean", "Compactness mean")
plot_resid(least_fit_output, "fractal_dimension_mean", "Fractal dimension mean")
plot_resid(least_fit_output, "radius_mean", "Radius mean")

#LASSO
lasso_fit_output <- resid_output(lasso_fit, breastCa_Re)
# Residuals vs fitted values
plot_resid(lasso_fit_output, ".pred", "Fitted values", ylab = "Residuals")
# Residuals vs. predictors (x's)
plot_resid(lasso_fit_output, "concavity_mean", "Concavity mean")
plot_resid(lasso_fit_output, "compactness_mean", "Compactness mean")
plot_resid(lasso_fit_output, "fractal_dimension_mean", "Fractal dimension mean")
plot_resid(lasso_fit_output, "radius_mean", "Radius mean")
set.seed(123)
# Generalized additive model (mgcv engine): smooth terms s() for radius and
# perimeter, linear terms for the remaining predictors
gam_spec <-
gen_additive_mod() %>%
set_engine(engine = 'mgcv') %>%
set_mode('regression')
gam_mod <- fit(gam_spec,
area_mean ~ s(radius_mean)+texture_mean+s(perimeter_mean)+smoothness_mean+compactness_mean+concave_points_mean+concavity_mean+symmetry_mean+fractal_dimension_mean,
data = breastCa_Re_new
)
# 2x2 panel of mgcv's diagnostic plots / basis-dimension (k) checks
par(mfrow=c(2,2))
gam_mod %>% pluck('fit') %>% mgcv::gam.check()
##
## Method: GCV Optimizer: magic
## Smoothing parameter selection converged after 19 iterations.
## The RMS GCV score gradient at convergence was 3.693889e-05 .
## The Hessian was positive definite.
## Model rank = 26 / 26
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(radius_mean) 9.00 8.73 0.91 0.015 *
## s(perimeter_mean) 9.00 9.00 0.96 0.170
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
gam_mod %>% pluck('fit') %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## area_mean ~ s(radius_mean) + texture_mean + s(perimeter_mean) +
## smoothness_mean + compactness_mean + concave_points_mean +
## concavity_mean + symmetry_mean + fractal_dimension_mean
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 676.8849 9.4284 71.792 < 2e-16 ***
## texture_mean 0.2863 0.1136 2.521 0.012000 *
## smoothness_mean 176.4116 55.3592 3.187 0.001522 **
## compactness_mean -559.1059 41.8880 -13.348 < 2e-16 ***
## concave_points_mean -193.1237 55.0443 -3.509 0.000488 ***
## concavity_mean 75.1818 19.7185 3.813 0.000153 ***
## symmetry_mean 24.5876 21.8517 1.125 0.261001
## fractal_dimension_mean 193.2815 164.7137 1.173 0.241134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(radius_mean) 8.726 8.982 79.72 <2e-16 ***
## s(perimeter_mean) 9.000 9.000 36.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.999 Deviance explained = 99.9%
## GCV = 115.47 Scale est. = 110.25 n = 569
# Natural-spline recipe (df = 9) layered on the shared preprocessing.
# BUG FIX: the original called step_ns(x, deg_free = 9), but no column `x`
# exists in breastCa_Re, so prepping/fitting this workflow would error.
# radius_mean is used here, matching the smooth term chosen for the GAM
# above -- TODO confirm this was the intended spline predictor.
ns_rec <- least_rec %>%
  step_ns(radius_mean, deg_free = 9)
ns9_wf <- workflow() %>%
  add_recipe(ns_rec) %>%
  add_model(lm_spec)
# Distribution of the outcome variable
hist(breastCa_Re_new$area_mean)
# Partial-effect plots for the two smooth terms
gam_mod %>% pluck('fit') %>% plot()
# Training-set predictions and error metrics for the GAM
# (note: evaluated on the training data, so these are optimistic)
gam_test_output <- gam_mod %>%
predict(new_data=breastCa_Re_new) %>%
bind_cols(breastCa_Re_new %>% select(area_mean))
gam_test_output %>%
rmse(truth=area_mean, estimate=.pred)
gam_test_output %>%
mae(truth=area_mean, estimate=.pred)
# Rebuild the data for classification: drop the "se" feature columns (13:22)
# and the id column, keeping diagnosis plus the mean and worst features
breastCa_Re <- breastCa %>%
  drop_na() %>%
  select(-c(13:22)) %>%
  select(-1)
# BUG FIX: the original did select(-10) after the rename, which dropped
# symmetry_mean (position 10 once diagnosis is column 1) and silently kept a
# duplicate of `concave points_mean` (visible in the later summary() output,
# which shows both concave columns and no symmetry_mean). Drop the original
# backticked column by name instead.
breastCa_Re_new <- breastCa_Re %>%
  mutate(concave_points_mean = `concave points_mean`) %>%
  select(-`concave points_mean`)
# Make sure you set reference level (to the outcome you are NOT interested in)
breastCa_Re_new2 <- breastCa_Re_new %>%
  mutate(diagnosis = relevel(factor(diagnosis), ref = 'B')) #set reference level
data_cv10 <- vfold_cv(breastCa_Re_new2, v = 10)
# Logistic LASSO Regression Model Spec
logistic_lasso_spec_tune <- logistic_reg() %>%
set_engine('glmnet') %>%
set_args(mixture = 1, penalty = tune()) %>%
set_mode('classification')
# Recipe
# Standardize numeric predictors (required for LASSO) and dummy-encode nominals
logistic_rec <- recipe(diagnosis ~ ., data = breastCa_Re_new2) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Workflow (Recipe + Model)
log_lasso_wf <- workflow() %>%
add_recipe(logistic_rec) %>%
add_model(logistic_lasso_spec_tune)
# Tune Model (trying a variety of values of Lambda penalty)
# 100 penalty values, log10-spaced from 1e-5 to 1e1
penalty_grid <- grid_regular(
penalty(range = c(-5, 1)), #log10 transformed (kept moving min down from 0)
levels = 100)
# CV grid search; save_pred keeps fold-level predictions, and
# event_level = 'second' treats M (the second factor level) as the event
tune_output <- tune_grid(
log_lasso_wf, # workflow
resamples = data_cv10, # cv folds
metrics = metric_set(roc_auc,accuracy),
control = control_resamples(save_pred = TRUE, event_level = 'second'),
grid = penalty_grid # penalty grid defined above
)
# Visualize Model Evaluation Metrics from Tuning
autoplot(tune_output) + theme_classic()
# choose penalty value based on the largest penalty within 1 se of the highest CV roc_auc
best_se_penalty <- select_by_one_std_err(tune_output, metric = 'roc_auc', desc(penalty))
# incorporates penalty value to workflow, then refit on the full data
final_fit_se <- finalize_workflow(log_lasso_wf, best_se_penalty) %>%
  fit(data = breastCa_Re_new2)
final_fit_se %>% tidy()
# Coefficients shrunk exactly to zero by the LASSO
final_fit_se %>% tidy() %>%
  filter(estimate == 0)
#variable importance
# For each predictor, count how many lambda values it survives before being
# excluded from the model for good (larger = more important). glmnet's beta
# matrix is predictors x lambdas, ordered from largest to smallest lambda.
glmnet_output <- final_fit_se %>% extract_fit_engine()
# Create a boolean matrix (predictors x lambdas) of variable exclusion
bool_predictor_exclude <- glmnet_output$beta == 0
# Loop over each variable. vapply (rather than sapply) guarantees a numeric
# vector regardless of input shape.
var_imp <- vapply(seq_len(nrow(bool_predictor_exclude)), function(row) {
  # Extract coefficient path (sorted from highest to lowest lambda)
  this_coeff_path <- bool_predictor_exclude[row, ]
  # Compute and return the # of lambdas until this variable is out forever
  ncol(bool_predictor_exclude) - which.min(this_coeff_path) + 1
}, numeric(1))
# Create a dataset of this information and sort
var_imp_data <- tibble(
  var_name = rownames(bool_predictor_exclude),
  var_imp = var_imp
)
var_imp_data %>% arrange(desc(var_imp))
# CV results for "best lambda"
tune_output %>%
  collect_metrics() %>%
  filter(penalty == best_se_penalty %>% pull(penalty))
# Count up number of B and M in the training data
breastCa_Re_new2 %>%
  count(diagnosis) # Name of the outcome variable goes inside count()
# Compute the NIR (no-information rate): the proportion of the majority class.
# Computed from the data rather than hard-coding 357/(357+212), so it stays
# correct if the data change. (Here: 357 B vs 212 M, so NIR = 0.6274165.)
NIR <- breastCa_Re_new2 %>%
  count(diagnosis) %>%
  summarize(nir = max(n) / sum(n)) %>%
  pull(nir)
NIR
## [1] 0.6274165
# Soft Predictions on Training Data
final_output <-
final_fit_se %>% predict(new_data = breastCa_Re_new2, type = 'prob') %>% bind_cols(breastCa_Re_new2)
# Predicted probability of M, split by the true diagnosis
final_output %>%
ggplot(aes(x = diagnosis, y = .pred_M)) +
geom_boxplot()
# Use soft predictions
# ROC curve treating M (the second factor level) as the event
final_output %>%
roc_curve(diagnosis,.pred_M,event_level = 'second') %>%
autoplot()
# thresholds in terms of reference level
# Sweep classification thresholds on .pred_B from 0 to 1 in steps of 0.01
threshold_output <- final_output %>%
threshold_perf(truth = diagnosis, estimate = .pred_B, thresholds = seq(0,1,by=.01))
# J-index v. threshold for not M
threshold_output %>%
filter(.metric == 'j_index') %>%
ggplot(aes(x = .threshold, y = .estimate)) +
geom_line() +
labs(y = 'J-index', x = 'threshold') +
theme_classic()
# Best thresholds ranked by J-index (higher is better)
threshold_output %>%
filter(.metric == 'j_index') %>%
arrange(desc(.estimate))
# Distance v. threshold for not M
threshold_output %>%
filter(.metric == 'distance') %>%
ggplot(aes(x = .threshold, y = .estimate)) +
geom_line() +
labs(y = 'Distance', x = 'threshold') +
theme_classic()
# Best thresholds ranked by distance (smaller is better)
threshold_output %>%
filter(.metric == 'distance') %>%
arrange(.estimate)
# Hard classifications at the chosen 0.64 threshold on .pred_B;
# report accuracy, sensitivity, specificity with M as the event
log_metrics <- metric_set(accuracy,sens,yardstick::spec)
final_output %>%
mutate(.pred_class = make_two_class_pred(.pred_B, levels(diagnosis), threshold = .64)) %>%
log_metrics(truth = diagnosis, estimate = .pred_class, event_level = 'second')
# Model Specification
# Random forest via ranger: 1000 trees, min node size 2, hard class
# predictions, impurity-based variable importance. mtry is set per workflow.
rf_spec <- rand_forest() %>%
set_engine(engine = 'ranger') %>%
set_args(mtry = NULL, # size of random subset of variables; default is floor(sqrt(ncol(x)))
trees = 1000, # Number of trees
min_n = 2,
probability = FALSE, # FALSE: hard predictions
importance = 'impurity') %>%
set_mode('classification') # change this for regression tree
# Recipe
# No preprocessing needed for trees; the recipe just declares the formula
data_rec <- recipe(diagnosis ~ ., data = breastCa_Re_new2)
# Workflows
# One workflow per candidate mtry value (2, 4, 10, 20)
data_wf_mtry2 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 2)) %>%
add_recipe(data_rec)
# Create workflows for mtry = 4 , 10, and 20
data_wf_mtry4 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 4)) %>%
add_recipe(data_rec)
data_wf_mtry10 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 10)) %>%
add_recipe(data_rec)
data_wf_mtry20 <- workflow() %>%
add_model(rf_spec %>% set_args(mtry = 20)) %>%
add_recipe(data_rec)
# Fit Models
# Re-seeding before each fit makes the four forests share the same
# bootstrap/tree randomness, so only mtry differs between them
set.seed(123) # make sure to run this before each fit so that you have the same 1000 trees
data_fit_mtry2 <- fit(data_wf_mtry2, data = breastCa_Re_new2)
set.seed(123)
data_fit_mtry4 <- fit(data_wf_mtry4, data = breastCa_Re_new2)
set.seed(123)
data_fit_mtry10 <- fit(data_wf_mtry10, data = breastCa_Re_new2)
set.seed(123)
data_fit_mtry20 <- fit(data_wf_mtry20, data = breastCa_Re_new2)
# Collect a fitted random forest's out-of-bag predictions alongside the
# observed outcomes, tagged with a model label so multiple forests can be
# stacked and compared.
rf_OOB_output <- function(fit_model, model_label, truth){
  engine_fit <- extract_fit_engine(fit_model)
  oob_preds <- pluck(engine_fit, 'predictions') # ranger's OOB predictions
  tibble(
    .pred_diagnosis = oob_preds,
    diagnosis = truth,
    model = model_label
  )
}
#check out the function output
rf_OOB_output(data_fit_mtry2,'mtry2', breastCa_Re_new2 %>% pull(diagnosis))
# Evaluate OOB Metrics
# Stack OOB predictions for all four mtry settings into one long tibble
data_rf_OOB_output <- bind_rows(
rf_OOB_output(data_fit_mtry2,'mtry2', breastCa_Re_new2 %>% pull(diagnosis)),
rf_OOB_output(data_fit_mtry4,'mtry4', breastCa_Re_new2 %>% pull(diagnosis)),
rf_OOB_output(data_fit_mtry10,'mtry10', breastCa_Re_new2 %>% pull(diagnosis)),
rf_OOB_output(data_fit_mtry20,'mtry20', breastCa_Re_new2 %>% pull(diagnosis))
)
# OOB accuracy per mtry setting
data_rf_OOB_output %>%
group_by(model) %>%
accuracy(truth = diagnosis, estimate = .pred_diagnosis)
# Plot OOB accuracy against mtry (numeric mtry parsed from the model label)
data_rf_OOB_output %>%
group_by(model) %>%
accuracy(truth = diagnosis, estimate =.pred_diagnosis) %>%
mutate(mtry = as.numeric(stringr::str_replace(model,'mtry',''))) %>%
ggplot(aes(x = mtry, y = .estimate )) +
geom_point() +
geom_line() +
theme_classic()
# Print the fitted mtry = 2 workflow (ranger summary shown below)
data_fit_mtry2
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## 0 Recipe Steps
##
## -- Model -----------------------------------------------------------------------
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~2, x), num.trees = ~1000, min.node.size = min_rows(~2, x), probability = ~FALSE, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Classification
## Number of trees: 1000
## Sample size: 569
## Number of independent variables: 20
## Mtry: 2
## Target node size: 2
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error: 3.51 %
# OOB confusion matrix for the mtry = 2 forest
rf_OOB_output(data_fit_mtry2,'mtry2', breastCa_Re_new2 %>% pull(diagnosis)) %>%
conf_mat(truth = diagnosis, estimate= .pred_diagnosis)
## Truth
## Prediction B M
## B 351 14
## M 6 198
# Impurity-based variable importance for the mtry = 2 forest
data_fit_mtry2 %>%
extract_fit_engine() %>%
vip(num_features = 30) + theme_classic()
# Violin plots of selected features by diagnosis
ggplot(breastCa_Re_new2, aes(x = diagnosis, y = area_worst)) +
geom_violin() + theme_classic()
ggplot(breastCa_Re_new2, aes(x = diagnosis, y = fractal_dimension_mean)) +
geom_violin() + theme_classic()
#intermediate important
ggplot(breastCa_Re_new2, aes(x = diagnosis, y = perimeter_mean)) +
geom_violin() + theme_classic()
# Rebuild the working data for clustering: drop the "se" feature columns
# (13:22) and the id column, keeping diagnosis plus mean/worst features
breastCa_Re <- breastCa %>%
  drop_na() %>%
  select(-c(13:22)) %>%
  select(-1)
# BUG FIX: the original select(-10) dropped symmetry_mean (position 10 once
# diagnosis is column 1) and silently kept a duplicate of
# `concave points_mean`. Drop the original backticked column by name instead.
breastCa_Re_new <- breastCa_Re %>%
  mutate(concave_points_mean = `concave points_mean`) %>%
  select(-`concave points_mean`)
# Scatter of the two variables chosen for 2-D clustering
ggplot(breastCa_Re_new, aes(x = perimeter_mean, y = `concave points_worst`)) +
geom_point() +
theme_classic()
# Select just the perimeter_mean and concave points_worst variables
breastCa_Re_new_sub <- breastCa_Re_new %>%
select(perimeter_mean, `concave points_worst`)
# Run k-means for k = centers = 2
# (on the raw, unscaled variables -- compare with the scaled run below)
set.seed(253)
kclust_k2 <- kmeans(breastCa_Re_new_sub, centers = 2)
# Display the cluster assignments
kclust_k2$cluster
## [1] 2 2 2 1 2 1 2 1 1 1 2 2 2 2 1 1 1 2 2 1 1 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1
## [38] 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1
## [75] 1 2 1 2 2 1 1 1 2 2 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1
## [112] 1 1 1 1 1 1 1 2 2 1 2 2 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 2 1 1 2 2 1 1 1 2 1 1 1 1 2 1 1 2 2 1 1 1
## [186] 1 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 1 1 1 1 2 1 1 2 1 2 2 1 1 1 1 2 2 1 1
## [223] 1 2 1 1 1 1 1 1 2 1 1 2 1 1 2 2 1 2 1 1 1 1 2 1 1 1 1 1 2 1 2 2 2 1 2 2 2
## [260] 2 2 2 2 1 2 2 1 1 1 1 1 1 2 1 2 1 1 2 1 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 1 1 2 2 2 1 1
## [334] 1 1 2 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 2 2
## [371] 2 1 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 2
## [408] 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1 1 1 2 1 1
## [445] 2 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1
## [482] 1 1 1 2 1 1 2 1 2 1 2 2 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 2 2
## [519] 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 2 2 2 2 2 2 1
# Add a variable (kclust_k2) to the original dataset
# containing the cluster assignments
breastCa_Re_new <- breastCa_Re_new %>%
mutate(kclust_2 = factor(kclust_k2$cluster))
# Visualize the cluster assignments on the original scatterplot
# (interactive: hovering shows the true diagnosis for comparison)
originalClusterPlot <- ggplot(
breastCa_Re_new,
aes(
x = perimeter_mean,
y = `concave points_worst`,
color = kclust_2,
text = paste('diagnosis: ', diagnosis)
)
) +
geom_point() +
theme_classic()
ggplotly(originalClusterPlot , tooltip = c( "text"))
# Run k-means on the *scaled* data (all variables have SD = 1)
set.seed(253)
kclust_k2_scale <- kmeans(scale(breastCa_Re_new_sub), centers = 2)
breastCa_Re_new <- breastCa_Re_new %>%
  mutate(kclust_2_scale = factor(kclust_k2_scale$cluster))
# Visualize the new cluster assignments
# BUG FIX: the original coloured this plot by kclust_2 (the unscaled
# clustering), so the "scaled" plot just repeated the old assignments;
# colour by kclust_2_scale instead.
scaledClusterPlot <- ggplot(
  breastCa_Re_new,
  aes(
    x = perimeter_mean,
    y = `concave points_worst`,
    color = kclust_2_scale,
    text = paste('diagnosis: ', diagnosis)
  )
) +
  geom_point() +
  theme_classic()
ggplotly(scaledClusterPlot , tooltip = c( "text"))
# Select the variables to be used in clustering
# NOTE(review): columns are selected by position (2:21), which silently tracks
# whatever breastCa_Re_new currently contains; selecting by name or with
# where(is.numeric) would be safer.
breastCa_Re_new_sub2 <- breastCa_Re_new %>%
select(c(2:21))
# Look at summary statistics of the selected variables
summary(breastCa_Re_new_sub2)
## radius_mean texture_mean perimeter_mean area_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
## Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
## Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
## Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
## smoothness_mean compactness_mean concavity_mean concave points_mean
## Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
## Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
## Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
## 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
## Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
## fractal_dimension_mean radius_worst texture_worst perimeter_worst
## Min. :0.04996 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.05770 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
## Median :0.06154 Median :14.97 Median :25.41 Median : 97.66
## Mean :0.06280 Mean :16.27 Mean :25.68 Mean :107.26
## 3rd Qu.:0.06612 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
## Max. :0.09744 Max. :36.04 Max. :49.54 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000
## 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145
## Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267
## Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722
## 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829
## Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520
## concave points_worst symmetry_worst fractal_dimension_worst
## Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.29100 Max. :0.6638 Max. :0.20750
## concave_points_mean
## Min. :0.00000
## 1st Qu.:0.02031
## Median :0.03350
## Mean :0.04892
## 3rd Qu.:0.07400
## Max. :0.20120
set.seed(253)
# k-means with k = 2 on all standardized features
kclust_k2_allvars <- kmeans(scale(breastCa_Re_new_sub2), centers = 2)
breastCa_Re_new <- breastCa_Re_new %>%
mutate(kclust_k2_allvars = factor(kclust_k2_allvars$cluster))
# Cross-tabulate cluster membership against the true diagnosis
breastCa_Re_new %>%
count(diagnosis,kclust_k2_allvars)
# Per-cluster means of the feature columns (positions 2:21)
breastCa_Re_new %>%
group_by(kclust_k2_allvars) %>%
summarize(across(c(2:21), mean))
# Cluster the (standardized) data into k groups and return the total
# within-cluster sum of squares. Generalized: the data to cluster can now be
# passed explicitly; the default keeps the original hard-coded global, so
# existing calls behave identically.
# NOTE(review): kmeans uses random starts, so results vary run to run unless
# a seed is set before each call.
breastCa_Re_new_cluster_ss <- function(k, data = breastCa_Re_new_sub2){
  # Perform clustering on the standardized data
  kclust <- kmeans(scale(data), centers = k)
  # Return the total within-cluster sum of squares
  return(kclust$tot.withinss)
}
# Elbow plot: total within-cluster SS for k = 1..20
# NOTE(review): no seed is set inside the map, so exact values vary per run
tibble(
k = 1:20,
tot_wc_ss = purrr::map_dbl(1:20, breastCa_Re_new_cluster_ss)
) %>%
ggplot(aes(x = k, y = tot_wc_ss)) +
geom_point() +
geom_line()+
labs(x = "Number of clusters",y = 'Total within-cluster sum of squares') +
theme_classic()
# Run k-means for k = centers = 3
# (on the two raw, unscaled variables)
set.seed(253)
kclust_k3 <- kmeans(breastCa_Re_new_sub, centers = 3)
# Display the cluster assignments
kclust_k3$cluster
## [1] 3 3 3 1 3 2 3 2 2 2 2 2 3 2 2 2 2 2 3 2 2 1 2 3 2 3 2 3 2 3 3 1 3 3 2 2 2
## [38] 2 2 2 2 1 3 2 2 3 1 2 1 2 1 2 1 3 2 1 3 2 2 1 1 1 2 1 2 2 1 1 1 1 3 1 3 2
## [75] 1 2 2 3 3 2 1 2 3 3 1 3 2 3 1 2 2 2 2 2 2 3 1 1 1 2 2 1 1 1 1 2 1 1 3 1 1
## [112] 1 2 1 1 1 1 2 2 3 1 3 3 2 2 2 2 3 2 3 1 2 2 2 3 1 1 1 2 1 1 2 1 2 1 1 1 2
## [149] 2 2 2 1 1 1 2 1 3 2 1 1 1 3 3 1 3 2 1 2 3 2 1 2 2 1 1 1 1 2 1 1 3 3 2 1 2
## [186] 1 3 1 1 1 2 1 1 1 2 2 2 3 3 2 1 3 3 2 1 2 1 2 2 2 3 1 3 3 2 2 1 1 3 3 2 2
## [223] 1 2 2 2 1 2 1 2 3 1 1 3 1 2 3 3 2 3 2 1 1 2 3 1 2 2 1 1 3 1 3 3 3 2 3 2 2
## [260] 2 3 2 3 2 2 3 1 2 1 1 2 1 3 1 3 1 1 3 2 2 3 1 3 2 2 1 1 1 1 1 2 2 2 1 1 2
## [297] 1 1 2 1 3 1 3 1 1 1 2 1 2 2 1 2 1 1 1 1 1 3 1 1 1 3 2 3 1 1 2 1 2 2 2 2 1
## [334] 1 1 2 2 3 1 3 2 1 1 3 1 1 1 2 1 1 1 2 3 2 1 1 2 2 1 1 1 2 1 2 2 3 3 1 3 3
## [371] 2 2 3 3 2 2 1 2 2 1 1 1 1 1 2 2 1 2 1 3 1 1 2 3 1 2 2 2 1 1 3 1 2 2 1 1 2
## [408] 2 3 1 1 1 1 2 2 1 1 2 1 1 1 2 1 2 1 1 1 1 1 1 2 1 3 3 2 2 2 2 2 2 1 2 2 1
## [445] 3 1 3 2 2 3 1 3 1 2 1 2 1 2 2 1 2 3 2 1 2 2 2 1 3 1 1 1 2 1 1 2 2 2 1 2 1
## [482] 2 2 2 2 2 2 3 1 2 1 3 3 1 2 2 2 1 3 3 2 2 1 3 1 1 1 1 2 2 1 2 2 2 2 1 3 3
## [519] 2 2 1 3 1 2 1 1 2 1 2 1 1 1 2 3 1 3 2 1 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 2
## [556] 1 1 1 2 1 2 1 2 3 3 3 2 3 1
# Run k-means on the *scaled* data (all variables have SD = 1)
set.seed(253)
kclust_k3_scale <- kmeans(scale(breastCa_Re_new_sub), centers = 3)
breastCa_Re_new <- breastCa_Re_new %>%
mutate(kclust_3_scale = factor(kclust_k3_scale$cluster))
# Visualize the new cluster assignments
# (interactive: hovering shows the true diagnosis for comparison)
newClusterKPlot <- ggplot(
breastCa_Re_new,
aes(
x = perimeter_mean,
y = `concave points_worst`,
color = kclust_3_scale,
text = paste('diagnosis: ', diagnosis)
)
) +
geom_point() +
theme_classic()
ggplotly(newClusterKPlot , tooltip = c( "text"))
set.seed(253)
# k-means with k = 3 on all standardized features
kclust_k3_allvars <- kmeans(scale(breastCa_Re_new_sub2), centers = 3)
#within clusters sum of squares
# Printing the kmeans object shows cluster sizes, centers, and within-cluster SS
kclust_k3_allvars
## K-means clustering with 3 clusters of sizes 370, 93, 106
##
## Cluster means:
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 -0.47294644 -0.2447179 -0.49151681 -0.46834420 -0.3393479
## 2 -0.01575391 0.2342339 0.05989892 -0.08895702 0.9405564
## 3 1.66467260 0.6486969 1.66311907 1.71283356 0.3593111
## compactness_mean concavity_mean concave points_mean fractal_dimension_mean
## 1 -0.5408815 -0.5826564 -0.5916723 -0.1707039
## 2 1.1632023 0.9455177 0.6854667 1.0983509
## 3 0.8674372 1.2042428 1.4638711 -0.3677942
## radius_worst texture_worst perimeter_worst area_worst smoothness_worst
## 1 -0.5121756 -0.2770056 -0.5286914 -0.49367809 -0.3671331
## 2 0.1171932 0.4474237 0.2045023 0.01879255 1.0989419
## 3 1.6849621 0.5743552 1.6660104 1.70672818 0.3173362
## compactness_worst concavity_worst concave points_worst symmetry_worst
## 1 -0.5208214 -0.5605965 -0.6036665 -0.3364717
## 2 1.3364447 1.2237501 0.9683800 1.0287464
## 3 0.6454203 0.8831314 1.2575212 0.2718973
## fractal_dimension_worst concave_points_mean
## 1 -0.37590256 -0.5916723
## 2 1.43420972 0.6854667
## 3 0.05379665 1.4638711
##
## Clustering vector:
## [1] 2 3 3 2 3 2 3 2 2 2 1 2 3 1 2 2 1 2 3 1 1 1 2 3 3 2 2 3 2 3 3 2 3 3 2 2 2
## [38] 1 1 2 1 2 3 2 2 3 1 2 1 1 1 1 1 3 1 1 3 2 1 1 1 1 2 1 2 2 1 1 2 1 3 1 2 1
## [75] 1 1 1 3 3 1 1 2 3 3 1 3 1 3 1 1 1 1 1 1 2 3 1 1 1 2 1 1 1 1 1 2 1 1 3 1 1
## [112] 1 2 1 1 1 1 2 2 1 1 3 3 1 1 1 1 3 2 3 1 2 2 1 3 1 1 1 2 1 1 1 1 1 1 1 2 1
## [149] 1 1 1 2 2 1 1 1 3 1 1 1 1 3 3 1 3 1 1 1 3 1 1 1 2 1 1 1 1 2 1 1 3 3 2 1 1
## [186] 1 3 1 1 1 2 1 1 2 2 1 2 1 3 2 1 3 3 2 1 1 1 1 2 1 3 1 3 1 2 2 1 1 3 3 1 1
## [223] 1 2 1 1 1 1 1 2 2 1 1 3 1 1 3 3 1 3 1 1 2 1 3 1 1 2 1 1 3 1 3 3 3 1 3 2 2
## [260] 2 3 1 3 1 3 3 1 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 3 2 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 1 1 3 1 3 1 1 1 1 2 2 2 1 1
## [334] 1 1 3 1 3 1 3 1 1 1 3 1 1 1 1 1 1 1 2 3 2 1 1 1 1 1 1 1 1 1 1 1 3 3 1 3 3
## [371] 2 1 3 3 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 3 1 1 2 3 1 1 1 1 1 1 2 1 1 1 1 1 1
## [408] 1 3 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 3 3 1 2 1 1 1 1 1 3 1 1
## [445] 3 1 3 1 1 3 1 3 1 1 1 1 1 1 1 1 3 3 1 1 1 2 1 1 3 2 1 1 1 1 1 1 1 1 1 2 1
## [482] 1 1 1 1 2 1 3 1 1 1 1 3 1 1 1 2 1 3 3 1 2 1 3 2 2 1 1 1 2 1 1 2 1 1 1 3 3
## [519] 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 3 1 3 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [556] 1 1 1 1 1 1 1 2 3 3 3 1 3 1
##
## Within cluster sum of squares by cluster:
## [1] 2779.416 1314.407 1403.885
## (between_SS / total_SS = 51.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the k = 3 assignments and compare them to the true diagnosis
breastCa_Re_new <- breastCa_Re_new %>%
mutate(kclust_k3_allvars = factor(kclust_k3_allvars$cluster))
breastCa_Re_new %>%
count(diagnosis,kclust_k3_allvars)
# Per-cluster means of the feature columns (positions 2:21)
breastCa_Re_new %>%
group_by(kclust_k3_allvars) %>%
summarize(across(c(2:21), mean))